#endif
-void domain_relinquish_memory(struct domain *d)
+
+static void relinquish_list(struct domain *d, struct list_head *list)
{
- struct list_head *ent, *tmp;
+ struct list_head *ent;
struct pfn_info *page;
unsigned long x, y;
- /* Ensure that noone is running over the dead domain's page tables. */
- synchronise_pagetables(~0UL);
-
- /* Exit shadow mode before deconstructing final guest page table. */
- shadow_mode_disable(d);
-
- /* Drop the in-use reference to the page-table base. */
- if ( pagetable_val(d->mm.pagetable) != 0 )
- put_page_and_type(&frame_table[pagetable_val(d->mm.pagetable) >>
- PAGE_SHIFT]);
-
- /*
- * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
- * it automatically gets squashed when the guest's mappings go away.
- */
- destroy_gdt(d);
-
/* Use a recursive lock, as we may enter 'free_domheap_page'. */
spin_lock_recursive(&d->page_alloc_lock);
- /* Relinquish Xen-heap pages. */
- list_for_each_safe ( ent, tmp, &d->xenpage_list )
- {
- page = list_entry(ent, struct pfn_info, list);
-
- if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
- put_page(page);
- }
-
- /* Relinquish all pages on the domain's allocation list. */
- list_for_each_safe ( ent, tmp, &d->page_list )
+ /*
+ * Careful! Any time we might decrement a page's reference count we
+ * might invalidate our page pointer or our pointer into the page list.
+ * In such cases we have to exit the current iteration of the loop and
+ * start back at the beginning of the list. We are guaranteed to make
+ * forward progress because nothing will get added to the list (the domain
+ * is dying) and no pages will become pinned after we unpin them.
+ */
+ ent = list->next;
+ while ( ent != list )
{
page = list_entry(ent, struct pfn_info, list);
- if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_info) )
+ if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
+ {
+ /* NB. Check the allocation pin /before/ put_page_and_type()! */
+ if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
+ put_page(page);
put_page_and_type(page);
+ /* May have lost our place in the list - start over. */
+ ent = list->next;
+ continue;
+ }
if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
+ {
put_page(page);
+ /* May have lost our place in the list - start over. */
+ ent = list->next;
+ continue;
+ }
/*
* Forcibly invalidate base page tables at this point to break circular
x = y;
if ( likely((x & (PGT_type_mask|PGT_validated)) !=
(PGT_base_page_table|PGT_validated)) )
+ {
+ /*
+ * We have done no work on this iteration, so it is safe
+ * to move on to the next page in the list.
+ */
+ ent = ent->next;
break;
+ }
y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
if ( likely(y == x) )
+ {
free_page_type(page, PGT_base_page_table);
+ /* May have lost our place in the list - start over. */
+ ent = list->next;
+ }
}
while ( unlikely(y != x) );
}
spin_unlock_recursive(&d->page_alloc_lock);
+
+ /*
+ * Another CPU may have raced us to free some pages. Wait for those
+ * to trickle out now that we have released the lock.
+ */
+ while ( !list_empty(list) )
+ {
+ smp_mb();
+ cpu_relax();
+ }
+}
+
+
+void domain_relinquish_memory(struct domain *d)
+{
+ /* Ensure that no one is running over the dead domain's page tables. */
+ synchronise_pagetables(~0UL);
+
+ /* Exit shadow mode before deconstructing final guest page table. */
+ shadow_mode_disable(d);
+
+ /* Drop the in-use reference to the page-table base. */
+ if ( pagetable_val(d->mm.pagetable) != 0 )
+ put_page_and_type(&frame_table[pagetable_val(d->mm.pagetable) >>
+ PAGE_SHIFT]);
+
+ /*
+ * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
+ * it automatically gets squashed when the guest's mappings go away.
+ */
+ destroy_gdt(d);
+
+ /* Relinquish every page of memory. */
+ relinquish_list(d, &d->xenpage_list);
+ relinquish_list(d, &d->page_list);
}
/* Get another ref to L2 page so that it can be pinned. */
if ( !get_page_and_type(page, p, PGT_l2_page_table) )
BUG();
- set_bit(_PGC_guest_pinned, &page->count_info);
+ set_bit(_PGT_pinned, &page->u.inuse.type_info);
}
else
{
/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
static int
get_page_from_l2e(
- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned long va_idx)
+ l2_pgentry_t l2e, unsigned long pfn,
+ struct domain *d, unsigned long va_idx)
{
int rc;
rc = get_page_and_type_from_pagenr(
l2_pgentry_to_pagenr(l2e),
- PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
+ PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
if ( unlikely(!rc) )
return get_linear_pagetable(l2e, pfn, d);
return update_l2e(pl2e, ol2e, nl2e);
if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
- ((unsigned long)
- pl2e & ~PAGE_MASK) >> 2 )) )
+ ((unsigned long)pl2e &
+ ~PAGE_MASK) >> 2)) )
return 0;
if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
{
case MMUEXT_PIN_L1_TABLE:
case MMUEXT_PIN_L2_TABLE:
-
- /* When we pin an L1 page we now insist that the va
- backpointer (used for writable page tables) must still be
- mutable. This is an additional restriction even for guests
- that don't use writable page tables, but I don't think it
- will break anything as guests typically pin pages before
- they are used, hence they'll still be mutable. */
-
+ /*
+ * We insist that, if you pin an L1 page, it's the first thing that
+ * you do to it. This is because we require the backptr to still be
+ * mutable. This assumption seems safe.
+ */
okay = get_page_and_type_from_pagenr(
pfn,
((cmd==MMUEXT_PIN_L2_TABLE) ?
- PGT_l2_page_table : (PGT_l1_page_table | PGT_va_mutable) ) ,
+ PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
FOREIGNDOM);
if ( unlikely(!okay) )
break;
}
- if ( unlikely(test_and_set_bit(_PGC_guest_pinned,
- &page->count_info)) )
+ if ( unlikely(test_and_set_bit(_PGT_pinned,
+ &page->u.inuse.type_info)) )
{
MEM_LOG("Pfn %08lx already pinned", pfn);
put_page_and_type(page);
MEM_LOG("Page %08lx bad domain (dom=%p)",
ptr, page->u.inuse.domain);
}
- else if ( likely(test_and_clear_bit(_PGC_guest_pinned,
- &page->count_info)) )
+ else if ( likely(test_and_clear_bit(_PGT_pinned,
+ &page->u.inuse.type_info)) )
{
put_page_and_type(page);
put_page(page);
spin_lock(&e->page_alloc_lock);
- /* Check that 'e' will accept the page and has reservation headroom. */
+ /*
+ * Check that 'e' will accept the page and has reservation headroom.
+ * Also, a domain mustn't have PGC_allocated pages when it is dying.
+ */
ASSERT(e->tot_pages <= e->max_pages);
- if ( unlikely(e->tot_pages == e->max_pages) ||
+ if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
+ unlikely(e->tot_pages == e->max_pages) ||
unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
{
MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
- "provided a bad grant ref.\n", e->tot_pages, e->max_pages);
+ "provided a bad grant ref, or is dying (%08x).\n",
+ e->tot_pages, e->max_pages, e->flags);
spin_unlock(&e->page_alloc_lock);
put_domain(e);
okay = 0;
unsigned long prev_spfn = 0;
l1_pgentry_t *prev_spl1e = 0;
struct domain *d = current;
+ u32 type_info;
perfc_incrc(calls_to_mmu_update);
perfc_addc(num_page_updates, count);
}
page = &frame_table[pfn];
- switch ( (page->u.inuse.type_info & PGT_type_mask) )
+ switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
{
case PGT_l1_page_table:
- if ( likely(passive_get_page_type(page, PGT_l1_page_table)) )
+ if ( likely(get_page_type(
+ page, type_info & (PGT_type_mask|PGT_va_mask))) )
{
okay = mod_l1_entry((l1_pgentry_t *)va,
mk_l1_pgentry(req.val));
[ptwr_info[cpu].writable_l1>>PAGE_SHIFT];
#ifdef PTWR_TRACK_DOMAIN
- if (ptwr_domain[cpu] != get_current()->domain)
+ if (ptwr_domain[cpu] != current->domain)
printk("ptwr_reconnect_disconnected domain mismatch %d != %d\n",
- ptwr_domain[cpu], get_current()->domain);
+ ptwr_domain[cpu], current->domain);
#endif
- PTWR_PRINTK(("[A] page fault in disconnected space: addr %08lx space %08lx\n",
+ PTWR_PRINTK(("[A] page fault in disconn space: addr %08lx space %08lx\n",
addr, ptwr_info[cpu].disconnected << L2_PAGETABLE_SHIFT));
pl2e = &linear_l2_table[ptwr_info[cpu].disconnected];
int i, idx;
#ifdef PTWR_TRACK_DOMAIN
- if (ptwr_info[cpu].domain != get_current()->domain)
+ if (ptwr_info[cpu].domain != current->domain)
printk("ptwr_flush_inactive domain mismatch %d != %d\n",
- ptwr_info[cpu].domain, get_current()->domain);
+ ptwr_info[cpu].domain, current->domain);
#endif
#if 0
{
if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
{
#ifdef PTWR_TRACK_DOMAIN
- if ( ptwr_info[cpu].domain != get_current()->domain )
+ if ( ptwr_info[cpu].domain != current->domain )
printk("ptwr_do_page_fault domain mismatch %d != %d\n",
- ptwr_info[cpu].domain, get_current()->domain);
+ ptwr_info[cpu].domain, current->domain);
#endif
pl2e = &linear_l2_table[(page->u.inuse.type_info &
PGT_va_mask) >> PGT_va_shift];
break;
}
- if ( page->count_info & PGC_guest_pinned )
+ if ( page->u.inuse.type_info & PGT_pinned )
type |= LPINTAB;
l_arr[j] |= type;
put_page(page);
return i;
}
- if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_info) )
+ if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
put_page_and_type(page);
if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
spin_lock(&d->page_alloc_lock);
- if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
+ if ( unlikely(test_bit(DF_DYING, &d->flags)) ||
+ unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
{
DPRINTK("Over-allocation for domain %u: %u > %u\n",
d->domain, d->tot_pages + (1 << order), d->max_pages);
+ DPRINTK("...or the domain is dying (%d)\n",
+ !!test_bit(DF_DYING, &d->flags));
spin_unlock(&d->page_alloc_lock);
free_heap_pages(MEMZONE_DOM, pg, order);
return NULL;
if ( unlikely(IS_XEN_HEAP_FRAME(pg)) )
{
+ /* NB. May recursively lock from domain_relinquish_memory(). */
spin_lock_recursive(&d->page_alloc_lock);
for ( i = 0; i < (1 << order); i++ )
cleanup_writable_pagetable(
prev, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
-#ifdef PTWR_TRACK_DOMAIN
- {
- extern domid_t ptwr_domain[];
- int cpu = smp_processor_id();
- if (ptwr_domain[cpu] != prev->domain)
- printk("switch_to domain mismatch %d != %d\n",
- ptwr_domain[cpu], prev->domain);
- ptwr_domain[cpu] = next->domain;
- if (ptwr_disconnected[cpu] != ENTRIES_PER_L2_PAGETABLE ||
- ptwr_writable_idx[cpu])
- printk("switch_to ptwr dirty!!!\n");
- }
-#endif
-
perfc_incrc(sched_ctx);
#if defined(WAKE_HISTO)
/* Has this page been validated for use as its current type? */
#define _PGT_validated 28
#define PGT_validated (1<<_PGT_validated)
- /* 10-bit most significant bits of va address if used as l1 page table */
-#define PGT_va_shift 18
+ /* Owning guest has pinned this page to its current type? */
+#define _PGT_pinned 27
+#define PGT_pinned (1<<_PGT_pinned)
+ /* The 10 most significant bits of virt address if this is a page table. */
+#define PGT_va_shift 17
#define PGT_va_mask (((1<<10)-1)<<PGT_va_shift)
-#define PGT_va_mutable PGT_va_mask /* va backpointer is still mutable */
- /* 18-bit count of uses of this frame as its current type. */
-#define PGT_count_mask ((1<<18)-1)
+#define PGT_va_mutable PGT_va_mask /* va backpointer is mutable? */
+ /* 17-bit count of uses of this frame as its current type. */
+#define PGT_count_mask ((1<<17)-1)
/* For safety, force a TLB flush when this page's type changes. */
#define _PGC_tlb_flush_on_type_change 31
#define PGC_tlb_flush_on_type_change (1<<_PGC_tlb_flush_on_type_change)
- /* Owning guest has pinned this page to its current type? */
-#define _PGC_guest_pinned 30
-#define PGC_guest_pinned (1<<_PGC_guest_pinned)
/* Cleared when the owning guest 'frees' this page. */
-#define _PGC_allocated 29
+#define _PGC_allocated 30
#define PGC_allocated (1<<_PGC_allocated)
/* This bit is always set, guaranteeing that the count word is never zero. */
-#define _PGC_always_set 28
+#define _PGC_always_set 29
#define PGC_always_set (1<<_PGC_always_set)
- /* 27-bit count of references to this frame. */
-#define PGC_count_mask ((1<<28)-1)
+ /* 29-bit count of references to this frame. */
+#define PGC_count_mask ((1<<29)-1)
/* We trust the slab allocator in slab.c, and our use of it. */
#define PageSlab(page) (1)
nx &= ~PGT_validated;
}
}
- else if ( unlikely( ((nx & PGT_count_mask) == 1) &&
- test_bit(_PGC_guest_pinned, &page->count_info)) )
+ else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
+ (PGT_pinned | 1)) )
{
- /* if the page is pinned, but we're dropping the last reference
- then make the va backpointer mutable again */
+ /* Page is now only pinned. Make the back pointer mutable again. */
nx |= PGT_va_mutable;
}
}
nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
nx |= type;
/* No extra validation needed for writable pages. */
- if ( (type & PGT_type_mask) == PGT_writable_page )
+ if ( type == PGT_writable_page )
nx |= PGT_validated;
}
}
- else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
- {
- DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n",
- x & PGT_type_mask, type, page_to_pfn(page));
- return 0;
- }
- else if ( (x & PGT_va_mask) == PGT_va_mutable )
- {
- /* The va_backpointer is currently mutable, hence we update it. */
- nx &= ~PGT_va_mask;
- nx |= type; /* we know the actual type is correct */
- }
- else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask) ) )
+ else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
{
- /* The va backpointer wasn't mutable, and is different :-( */
- DPRINTK("Unexpected va backpointer (saw %08x != exp %08x) for pfn %08lx\n",
- x, type, page_to_pfn(page));
- return 0;
+ if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
+ {
+ DPRINTK("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
+ x & PGT_type_mask, type, page_to_pfn(page));
+ return 0;
+ }
+ else if ( (x & PGT_va_mask) == PGT_va_mutable )
+ {
+ /* The va backpointer is mutable, hence we update it. */
+ nx &= ~PGT_va_mask;
+ nx |= type; /* we know the actual type is correct */
+ }
+ else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
+ {
+ /* The va backpointer wasn't mutable, and is different. */
+ DPRINTK("Unexpected va backpointer (saw %08x != exp %08x)"
+ " for pfn %08lx\n", x, type, page_to_pfn(page));
+ return 0;
+ }
}
else if ( unlikely(!(x & PGT_validated)) )
{
/* Someone else is updating validation of this page. Wait... */
- while ( (y = page->u.inuse.type_info) != x )
+ while ( (y = page->u.inuse.type_info) == x )
{
rep_nop();
barrier();
return 1;
}
-/* This 'passive' version of get_page_type doesn't attempt to validate
-the page, but just checks the type and increments the type count. The
-function is called while doing a NORMAL_PT_UPDATE of an entry in an L1
-page table: We want to 'lock' the page for the brief beriod while
-we're doing the update, but we're not actually linking it in to a
-pagetable. */
-
-static inline int passive_get_page_type(struct pfn_info *page, u32 type)
-{
- u32 nx, x, y = page->u.inuse.type_info;
- again:
- do {
- x = y;
- nx = x + 1;
- if ( unlikely((nx & PGT_count_mask) == 0) )
- {
- DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
- return 0;
- }
- else if ( unlikely((x & PGT_count_mask) == 0) )
- {
- if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
- {
- nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
- nx |= type;
- }
- }
- else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
- {
- DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n",
- x & PGT_type_mask, type, page_to_pfn(page));
- return 0;
- }
- else if ( unlikely(!(x & PGT_validated)) )
- {
- /* Someone else is updating validation of this page. Wait... */
- while ( (y = page->u.inuse.type_info) != x )
- {
- rep_nop();
- barrier();
- }
- goto again;
- }
- }
- while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
-
- return 1;
-}
-
static inline void put_page_and_type(struct pfn_info *page)
{
* This makes sure that old versions of dom0 tools will stop working in a
* well-defined way (rather than crashing the machine, for instance).
*/
-#define DOM0_INTERFACE_VERSION 0xAAAA0013
+#define DOM0_INTERFACE_VERSION 0xAAAA0014
#define MAX_DOMAIN_NAME 16
#define MMUEXT_PIN_L2_TABLE 1 /* ptr = MA of frame to pin */
#define MMUEXT_PIN_L3_TABLE 2 /* ptr = MA of frame to pin */
#define MMUEXT_PIN_L4_TABLE 3 /* ptr = MA of frame to pin */
-#define MMUEXT_UNPIN_TABLE 1 /* ptr = MA of frame to unpin */
-#define MMUEXT_NEW_BASEPTR 2 /* ptr = MA of new pagetable base */
-#define MMUEXT_TLB_FLUSH 3 /* ptr = NULL */
-#define MMUEXT_INVLPG 4 /* ptr = VA to invalidate */
-#define MMUEXT_FLUSH_CACHE 5
-#define MMUEXT_SET_LDT 6 /* ptr = VA of table; val = # entries */
-#define MMUEXT_SET_FOREIGNDOM 7 /* val[31:16] = dom */
-#define MMUEXT_CLEAR_FOREIGNDOM 8
-#define MMUEXT_TRANSFER_PAGE 9 /* ptr = MA of frame; val[31:16] = dom */
-#define MMUEXT_REASSIGN_PAGE 10
+#define MMUEXT_UNPIN_TABLE 4 /* ptr = MA of frame to unpin */
+#define MMUEXT_NEW_BASEPTR 5 /* ptr = MA of new pagetable base */
+#define MMUEXT_TLB_FLUSH 6 /* ptr = NULL */
+#define MMUEXT_INVLPG 7 /* ptr = VA to invalidate */
+#define MMUEXT_FLUSH_CACHE 8
+#define MMUEXT_SET_LDT 9 /* ptr = VA of table; val = # entries */
+#define MMUEXT_SET_FOREIGNDOM 10 /* val[31:16] = dom */
+#define MMUEXT_CLEAR_FOREIGNDOM 11
+#define MMUEXT_TRANSFER_PAGE 12 /* ptr = MA of frame; val[31:16] = dom */
+#define MMUEXT_REASSIGN_PAGE 13
#define MMUEXT_CMD_MASK 255
#define MMUEXT_CMD_SHIFT 8